package cn.minimelon.solon.service.analyze.impl;

import cn.hutool.core.util.StrUtil;
import cn.minimelon.solon.service.analyze.SimilarityService;
import lombok.extern.slf4j.Slf4j;
import org.noear.solon.annotation.Component;

import java.io.UnsupportedEncodingException;
import java.util.HashMap;
import java.util.Iterator;
import java.util.Locale;
import java.util.Map;

/**
 * Scores how similar two texts are using the cosine of their character-frequency vectors.
 *
 * <p>Each distinct character is one axis of the vector; the value on that axis is the number
 * of times the character occurs in the document. Han characters (U+4E00..U+9FA5) are only
 * counted when they can be encoded in GB2312; every other character is counted by its own
 * char code.
 */
@Slf4j
@Component
public class SimilarityServiceImpl implements SimilarityService {

    /** Name of the charset used to decide which Han characters are counted. */
    private static final String GB2312 = "GB2312";

    /**
     * Added to every GB2312-derived id so that it can never be equal to a raw char code
     * (0..0xFFFF). Without it a Han character and an unrelated character whose code happens
     * to equal the Han character's GB2312 id would share one vector axis.
     */
    private static final int HANZI_KEY_OFFSET = Character.MAX_VALUE + 1;

    /**
     * Calculates the similarity score of two documents.
     *
     * <p>Both documents are trimmed, stripped of space characters and lower-cased before
     * comparison. When the frequency vectors point in exactly the same direction (cosine of
     * 1.0) but the normalized texts differ in length, the score is reduced by
     * {@code |len1 - len2| / (len1 * len2)} so that it is no longer a perfect 1.0.
     *
     * @param doc1 first document
     * @param doc2 second document
     * @return the cosine similarity of the two documents, possibly reduced by the length
     *         penalty described above
     * @throws NullPointerException if either document is {@code null} or has no characters
     *                              left after normalization
     */
    @Override
    public double calculateScore(String doc1, String doc2) {
        String left = normalize(doc1);
        String right = normalize(doc2);
        // getSimilarity rejects null/empty input, so both strings are non-null afterwards.
        double score = getSimilarity(left, right);
        if (score == 1.0 && left.length() != right.length()) {
            double lengthGap = Math.abs(right.length() - left.length());
            // Multiply as double: the int product overflows once both texts exceed ~46k chars.
            score = 1 - lengthGap / ((double) left.length() * right.length());
        }
        return score;
    }

    /**
     * Trims the document, removes every space character and lower-cases the rest.
     *
     * @param doc raw document, may be {@code null}
     * @return the normalized text, or {@code null} when trimming yields {@code null}
     */
    private static String normalize(String doc) {
        String trimmed = StrUtil.trim(doc);
        if (trimmed == null) {
            return null;
        }
        // Locale.ROOT keeps the case mapping independent of the JVM's default locale.
        return trimmed.replace(" ", "").toLowerCase(Locale.ROOT);
    }

    /**
     * Computes the cosine of the angle between the character-frequency vectors of two texts.
     *
     * @param doc1 first text
     * @param doc2 second text
     * @return {@code dot(v1, v2) / (|v1| * |v2|)}, or {@code 0.0} when either vector is empty
     *         because none of that text's characters could be counted
     * @throws NullPointerException if either text is {@code null} or blank
     */
    private double getSimilarity(String doc1, String doc2) {
        if (doc1 == null || doc1.trim().isEmpty() || doc2 == null || doc2.trim().isEmpty()) {
            throw new NullPointerException(" the Document is null or have not cahrs!!");
        }

        // key: character id, value: {occurrences in doc1, occurrences in doc2}
        Map<Integer, int[]> frequencies = new HashMap<>();
        buildAlgorithmMap(frequencies, doc1, 0);
        buildAlgorithmMap(frequencies, doc2, 1);

        double dotProduct = 0;
        double squaredNorm1 = 0;
        double squaredNorm2 = 0;
        for (int[] counts : frequencies.values()) {
            // Widen before multiplying so that large counts cannot overflow int.
            dotProduct += (double) counts[0] * counts[1];
            squaredNorm1 += (double) counts[0] * counts[0];
            squaredNorm2 += (double) counts[1] * counts[1];
        }

        double normProduct = Math.sqrt(squaredNorm1 * squaredNorm2);
        if (normProduct == 0) {
            // At least one text contributed no countable character; avoid returning 0/0 = NaN.
            return 0.0;
        }
        return dotProduct / normProduct;
    }

    /**
     * Adds the character occurrences of one document to the shared frequency map.
     *
     * @param algorithmMap frequency vectors keyed by character id
     * @param doc          source text
     * @param idx          slot of the two-element count array to increment: 0 or 1
     */
    private void buildAlgorithmMap(Map<Integer, int[]> algorithmMap, String doc, int idx) {
        for (int i = 0; i < doc.length(); i++) {
            int charIndex = getCharIndex(doc.charAt(i));
            if (charIndex != -1) {
                algorithmMap.computeIfAbsent(charIndex, key -> new int[2])[idx]++;
            }
        }
    }

    /**
     * Maps a character to the key of its axis in the frequency map.
     *
     * @param ch character to map
     * @return the char code for non-Han characters, an offset GB2312 id for Han characters,
     *         or {@code -1} when a Han character has no GB2312 id and must be skipped
     */
    private static int getCharIndex(char ch) {
        if (!isHanZi(ch)) {
            return getLetterId(ch);
        }
        short gb2312Id = getGB2312Id(ch);
        return gb2312Id < 0 ? -1 : HANZI_KEY_OFFSET + gb2312Id;
    }

    /**
     * Returns the id used for a non-Han character, which is simply its char code.
     *
     * @param ch character to map
     * @return the numeric value of {@code ch}
     */
    private static int getLetterId(char ch) {
        return ch;
    }

    /**
     * Tells whether the character lies in the range U+4E00..U+9FA5.
     *
     * @param ch character to test
     * @return {@code true} when {@code ch} is inside that range
     */
    private static boolean isHanZi(char ch) {
        return (ch >= 0x4E00 && ch <= 0x9FA5);
    }

    /**
     * Derives a compact id for a character from its two-byte GB2312 encoding.
     *
     * @param ch character to encode
     * @return {@code (byte0 - 0xA1) * 94 + (byte1 - 0xA1)}, or {@code -1} when the character
     *         does not encode to exactly two bytes or the GB2312 charset is unavailable
     */
    private static short getGB2312Id(char ch) {
        try {
            byte[] buffer = Character.toString(ch).getBytes(GB2312);
            if (buffer.length != 2) {
                // A GB2312 character always takes two bytes; any other length means the
                // character is not part of GB2312 and was replaced during encoding.
                return -1;
            }
            int b0 = (buffer[0] & 0x0FF) - 161; // codes start at 0xA1 = 161
            int b1 = (buffer[1] & 0x0FF) - 161; // each zone holds 16*6-2 = 94 positions
            return (short) (b0 * 94 + b1);
        } catch (UnsupportedEncodingException e) {
            log.warn("Charset {} is not available, character U+{} is ignored",
                    GB2312, Integer.toHexString(ch), e);
        }
        return -1;
    }
}
